# Load in rooted phylogenetic tree!
load("data/03_Phylogenetic_Tree/phytree_preprocessed_physeq.RData")
midroot_physeq_rm456## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2676 taxa and 89 samples ]
## sample_data() Sample Data: [ 89 samples by 47 sample variables ]
## tax_table() Taxonomy Table: [ 2676 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2676 tips and 2675 internal nodes ]
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2676 taxa and 89 samples ]
## sample_data() Sample Data: [ 89 samples by 47 sample variables ]
## tax_table() Taxonomy Table: [ 2676 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2676 tips and 2674 internal nodes ]
# Calculate the total number of reads per sample.
raw_TotalSeqs_df <-
midroot_physeq_rm456 %>%
# calculate the sample read sums
sample_sums() %>%
data.frame()
# name the column
colnames(raw_TotalSeqs_df)[1] <- "TotalSeqs"
head(raw_TotalSeqs_df)## TotalSeqs
## 20210602-MA-ABB1F 4024
## 20210602-MA-ABB1P 3086
## 20210602-MA-ABB1W 5719
## 20210602-MA-ABS1F 3824
## 20210602-MA-ABS1P 3743
## 20210602-MA-ABS1W 4742
# make histogram of raw reads
raw_TotalSeqs_df %>%
ggplot(aes(x = TotalSeqs)) +
geom_histogram(bins = 50) +
scale_x_continuous(limits = c(0, 10000)) +
labs(title = "Raw Sequencing Depth Distribution") +
theme_classic()## Warning: Removed 2 rows containing missing values (`geom_bar()`).
raw_rooted_physeq <-
midroot_physeq_rm456 %>%
# remove lowly seq sample that was outlier in alpha diversity analysis
subset_samples(names != "20210615-MA-ABB2F") %>%
# any asvs unique to this sample will also be removed
prune_taxa(taxa_sums(.) > 0, .)
# Inspect
raw_rooted_physeq## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2669 taxa and 88 samples ]
## sample_data() Sample Data: [ 88 samples by 47 sample variables ]
## tax_table() Taxonomy Table: [ 2669 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2669 tips and 2668 internal nodes ]
## [1] 2194
### scale_reads function
####################################################################################
# Function to scale reads: http://deneflab.github.io/MicrobeMiseq/
# Scales reads by
# 1) taking proportions
# 2) multiplying by a given library size of n
# 3) rounding
# Default for n is the minimum sample size in your library
# Default for round is floor
matround <- function(x){trunc(x+0.5)}
scale_reads <- function(physeq, n = min(sample_sums(physeq)), round = "round") {
# transform counts to n
physeq.scale <- transform_sample_counts(physeq, function(x) {(n * x/sum(x))})
# Pick the rounding functions
if (round == "floor"){
otu_table(physeq.scale) <- floor(otu_table(physeq.scale))
} else if (round == "round"){
otu_table(physeq.scale) <- round(otu_table(physeq.scale))
} else if (round == "matround"){
otu_table(physeq.scale) <- matround(otu_table(physeq.scale))
}
# Prune taxa and return new phyloseq object
physeq.scale <- prune_taxa(taxa_sums(physeq.scale) > 0, physeq.scale)
return(physeq.scale)
}This is where one might decide use rarefaction to normalize data.
## [1] 2194
# Scale reads by the above function
scaled_rooted_physeq <-
raw_rooted_physeq %>%
scale_reads(round = "matround")
# Calculate the read depth
scaled_TotalSeqs_df <-
scaled_rooted_physeq %>%
sample_sums() %>%
data.frame()
colnames(scaled_TotalSeqs_df)[1] <-"TotalSeqs"
# Inspect
head(scaled_TotalSeqs_df)## TotalSeqs
## 20210602-MA-ABB1F 2192
## 20210602-MA-ABB1P 2184
## 20210602-MA-ABB1W 2195
## 20210602-MA-ABS1F 2189
## 20210602-MA-ABS1P 2197
## 20210602-MA-ABS1W 2195
## [1] 2180
## [1] 2225
## [1] 45
# Plot Histogram
scaled_TotalSeqs_df %>%
ggplot(aes(x = TotalSeqs)) +
geom_histogram(bins = 50) +
scale_x_continuous(limits = c(0, 10000)) +
labs(title = "Scaled Sequencing Depth at 2194") +
theme_classic()## Warning: Removed 2 rows containing missing values (`geom_bar()`).
Exploratory analyses from Paliy & Shankar (2016) paper, which is using unconstrained ordination methods like PCoA.
# Calculate sorensen dissimilarity: Abundance-unweighted of shared taxa
scaled_soren_pcoa <-
ordinate(
physeq = scaled_rooted_physeq,
method = "PCoA",
distance = "bray", binary = TRUE)
#str(scaled_soren_pcoa)
# Plot the ordination
soren_station_pcoa <- plot_ordination(
physeq = scaled_rooted_physeq,
ordination = scaled_soren_pcoa,
color = "station",
shape = "station",
title = "Sorensen PCoA") +
geom_point(size=5, alpha = 0.5, aes(color = station)) +
scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
scale_color_manual(values = station_colors) +
theme_bw()
# Show the plot
soren_station_pcoaNote that I have removed the PERMANOVA test to below the ordinations. We will come back to it later!
Here, we are evaluating the shared taxa by presence/absence (abundance-unweighted) in the Sorensen metric.
Note that there is a weird arch or horseshoe-effect in the data. This is likely because we have complete species turnover between our stations. If you have this in your data, please take a look at the paper by Morton et al., 2017. More on this in NMDS…
Note that:
This means we explain 36% of the variation in the data in these two axes.
# Calculate the BC distance
scaled_BC_pcoa <-
ordinate(
physeq = scaled_rooted_physeq,
method = "PCoA",
distance = "bray")
# Plot the PCoA
bray_station_pcoa <-
plot_ordination(
physeq = scaled_rooted_physeq,
ordination = scaled_BC_pcoa,
color = "station",
shape = "station",
title = "Bray-Curtis PCoA") +
geom_point(size=5, alpha = 0.5, aes(color = station)) +
scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
scale_color_manual(values = c(station_colors)) +
theme_bw()
bray_station_pcoaHere, we are evaluating the shared taxa and then weighting them by their abundances, which provides more influence for species that are more abundant.
Note that there is a weird arch or horseshoe-effect in the data. This is likely because we have complete species turnover between our stations. If you have this in your data, please take a look at the paper by Morton et al., 2017. More on this in NMDS…
Note that:
This means we explain 48% of the variation in the data in these two axes, which is more than the previous plot with the Sorensen Dissimilarity. Abundance does seem to have an influence!!
It also looks like the samples are now separating more within each group than they did a bit more with Sorensen. Please note this with how the Copano West samples are looking between the Bray-Curtis and the Sorensen plots.
# Calculate the BC distance
scaled_wUNI_pcoa <-
ordinate(
physeq = scaled_rooted_physeq,
method = "PCoA",
distance = "wunifrac")
# Plot the PCoA
wUNI_station_pcoa <-
plot_ordination(
physeq = scaled_rooted_physeq,
ordination = scaled_wUNI_pcoa,
color = "station",
shape = "station",
title = "Weighted Unifrac PCoA") +
geom_point(size=5, alpha = 0.5, aes(color = station)) +
scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
scale_color_manual(values = c(station_colors)) +
theme_bw()
wUNI_station_pcoaHere, we are evaluating the shared taxa and then weighting them by their abundances, which provides more influence for species that are more abundant.
Note that there is a weird arch or horseshoe-effect in the data. This is likely because we have complete species turnover between our stations. If you have this in your data, please take a look at the paper by Morton et al., 2017. More on this in NMDS…
Note that:
This means we explain 70% of the variation in the data in these two axes (!!!), which is significantly more than the previous plots with the taxonomic dissimilarity measures. Here, phylogeny seems to be very important! This means that taxa that are abundant are found in different places in the phylogenetic tree. Therefore, the evoultionary distances (aka the branch lengths) and their abundances seem to have a major influence!!
It also looks like the samples are now separating more within each group than they did a bit more with Sorensen. Please note this with how the Copano West samples are looking between the Bray-Curtis and the Sorensen plots.
Let’s plot all three together into one plot to have a concise visualization of the three metrics.
(soren_station_pcoa + theme(legend.position = "none")) +
(bray_station_pcoa + theme(legend.position = "none")) +
(wUNI_station_pcoa + theme(legend.position = "none"))Since we did 3 of the dissimilarity metrics for the PCoA, let’s just plot one example of them for the NMDS plotting. Here, we will use weighted Unifrac
# Calculate the Weighted Unifrac distance
scaled_wUNI_nmds <-
ordinate(
physeq = scaled_rooted_physeq,
method = "NMDS",
distance = "wunifrac")## Run 0 stress 0.08850802
## Run 1 stress 0.1820551
## Run 2 stress 0.08850781
## ... New best solution
## ... Procrustes: rmse 0.0002277753 max resid 0.001768078
## ... Similar to previous best
## Run 3 stress 0.2077088
## Run 4 stress 0.08850869
## ... Procrustes: rmse 0.001242072 max resid 0.008811312
## ... Similar to previous best
## Run 5 stress 0.08850773
## ... New best solution
## ... Procrustes: rmse 0.0001066012 max resid 0.0008692466
## ... Similar to previous best
## Run 6 stress 0.187642
## Run 7 stress 0.2135437
## Run 8 stress 0.08850789
## ... Procrustes: rmse 8.698477e-05 max resid 0.0006694909
## ... Similar to previous best
## Run 9 stress 0.08850788
## ... Procrustes: rmse 0.0001231422 max resid 0.00100307
## ... Similar to previous best
## Run 10 stress 0.08850795
## ... Procrustes: rmse 0.000102928 max resid 0.0008064815
## ... Similar to previous best
## Run 11 stress 0.0885085
## ... Procrustes: rmse 0.001312513 max resid 0.008988158
## ... Similar to previous best
## Run 12 stress 0.1992754
## Run 13 stress 0.1682702
## Run 14 stress 0.08850869
## ... Procrustes: rmse 0.001282848 max resid 0.00895487
## ... Similar to previous best
## Run 15 stress 0.08850805
## ... Procrustes: rmse 0.0001822032 max resid 0.001499852
## ... Similar to previous best
## Run 16 stress 0.08850798
## ... Procrustes: rmse 0.0001135539 max resid 0.000870369
## ... Similar to previous best
## Run 17 stress 0.1923846
## Run 18 stress 0.08850795
## ... Procrustes: rmse 0.0001606615 max resid 0.001241155
## ... Similar to previous best
## Run 19 stress 0.08850874
## ... Procrustes: rmse 0.001272308 max resid 0.008930962
## ... Similar to previous best
## Run 20 stress 0.1471595
## *** Best solution repeated 10 times
# Plot the PCoA
wUNI_station_nmds <-
plot_ordination(
physeq = scaled_rooted_physeq,
ordination = scaled_wUNI_nmds,
color = "station",
shape = "station",
title = "Weighted Unifrac NMDS") +
geom_point(size=5, alpha = 0.5, aes(color = station)) +
scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
scale_color_manual(values = c(station_colors)) +
theme_bw()
wUNI_station_nmdsWe can see from above the plot that the stress value is ~0.15, which is just barely at the limit of the acceptable stress value. And, It seems important to emphasize that the PCoA and the NMDS plot both look pretty similar!
In this case, I would always prefer to report the PCoA results because they are linear and provide a lot more post-hoc analyses to follow up with. In addition, it’s helpful to only have 2 axes of variation and show how much variation is explained.
(wUNI_station_pcoa + theme(legend.position = "none")) +
(wUNI_station_nmds + theme(legend.position = "none"))# Calculate all three of the distance matrices
scaled_sorensen_dist <- phyloseq::distance(scaled_rooted_physeq, method = "bray", binary = TRUE)
scaled_bray_dist <- phyloseq::distance(scaled_rooted_physeq, method = "bray")
scaled_wUnifrac_dist <- phyloseq::distance(scaled_rooted_physeq, method = "wunifrac")
# make a data frame from the sample_data
# All distance matrices will be the same metadata because they
# originate from the same phyloseq object.
metadata <- data.frame(sample_data(scaled_rooted_physeq))
# Adonis test
# In this example we are testing the hypothesis that the five stations
# that were collected have different centroids in the ordination space
# for each of the dissimilarity metrics, we are using a discrete variable
adonis2(scaled_sorensen_dist ~ station, data = metadata)## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = scaled_sorensen_dist ~ station, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## station 4 6.3716 0.29265 8.5848 0.001 ***
## Residual 83 15.4007 0.70735
## Total 87 21.7723 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = scaled_bray_dist ~ station, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## station 4 7.3069 0.38689 13.094 0.001 ***
## Residual 83 11.5794 0.61311
## Total 87 18.8863 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = scaled_wUnifrac_dist ~ station, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## station 4 1.1233 0.45288 17.176 0.001 ***
## Residual 83 1.3571 0.54712
## Total 87 2.4804 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Note that:
Above, we see that the most variation is explained by the weighted unifrac, which explains ~45% of the variation in the data and also has the highest F-statistic.
# We might also care about other variables
# Here, we will add date and fraction as variables
# multiplicative model ORDER MATTERS!
adonis2(scaled_sorensen_dist ~ station * date * fraction, data = metadata)## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = scaled_sorensen_dist ~ station * date * fraction, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## station 4 6.3716 0.29265 12.7392 0.001 ***
## date 2 2.3935 0.10993 9.5710 0.001 ***
## fraction 2 0.5172 0.02376 2.0683 0.002 **
## station:date 8 3.9412 0.18102 3.9399 0.001 ***
## station:fraction 8 0.9169 0.04212 0.9167 0.693
## date:fraction 4 0.5037 0.02313 1.0070 0.443
## station:date:fraction 16 1.7514 0.08044 0.8754 0.894
## Residual 43 5.3767 0.24695
## Total 87 21.7723 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = scaled_bray_dist ~ station * date * fraction, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## station 4 7.3069 0.38689 23.2209 0.001 ***
## date 2 2.0117 0.10651 12.7859 0.001 ***
## fraction 2 0.7505 0.03974 4.7698 0.001 ***
## station:date 8 3.4783 0.18417 5.5270 0.001 ***
## station:fraction 8 0.7016 0.03715 1.1149 0.280
## date:fraction 4 0.3056 0.01618 0.9713 0.502
## station:date:fraction 16 0.9490 0.05025 0.7540 0.965
## Residual 43 3.3827 0.17911
## Total 87 18.8863 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Note that the ORDER MATTERS!
adonis2(scaled_wUnifrac_dist ~ station * date * fraction, data = metadata)## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = scaled_wUnifrac_dist ~ station * date * fraction, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## station 4 1.12332 0.45288 27.9816 0.001 ***
## date 2 0.17318 0.06982 8.6277 0.001 ***
## fraction 2 0.22887 0.09227 11.4023 0.001 ***
## station:date 8 0.32313 0.13027 4.0245 0.001 ***
## station:fraction 8 0.07703 0.03106 0.9594 0.530
## date:fraction 4 0.02648 0.01068 0.6596 0.824
## station:date:fraction 16 0.09684 0.03904 0.6031 0.995
## Residual 43 0.43156 0.17399
## Total 87 2.48042 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = scaled_wUnifrac_dist ~ date * station * fraction, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## date 2 0.17259 0.06958 8.5984 0.001 ***
## station 4 1.12391 0.45311 27.9963 0.001 ***
## fraction 2 0.22887 0.09227 11.4023 0.001 ***
## date:station 8 0.32313 0.13027 4.0245 0.001 ***
## date:fraction 4 0.02633 0.01062 0.6559 0.852
## station:fraction 8 0.07718 0.03111 0.9612 0.549
## date:station:fraction 16 0.09684 0.03904 0.6031 0.989
## Residual 43 0.43156 0.17399
## Total 87 2.48042 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can also run tests that include additive (+) or multipliciatve models, which include the interaction term between variables.
The PERMANOVA is sensitive to variance/dispersion in the data. Therefore, we need to run a homogeneity of dispersion test to test for the sensitivity of our PERMANOVA results to variance.
# Homogeneity of Disperson test with beta dispr
# Sorensen
beta_soren_station <- betadisper(scaled_sorensen_dist, metadata$station)
permutest(beta_soren_station)##
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
##
## Response: Distances
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 4 0.18429 0.046073 6.3863 999 0.001 ***
## Residuals 83 0.59878 0.007214
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bray-curtis
beta_bray_station <- betadisper(scaled_bray_dist, metadata$station)
permutest(beta_bray_station)##
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
##
## Response: Distances
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 4 0.37117 0.092794 7.0268 999 0.001 ***
## Residuals 83 1.09607 0.013206
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Weighted Unifrac
beta_bray_station <- betadisper(scaled_wUnifrac_dist, metadata$station)
permutest(beta_bray_station)##
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
##
## Response: Distances
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 4 0.042101 0.010525 4.195 999 0.005 **
## Residuals 83 0.208244 0.002509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Above, our variance is impacted by station. Therefore, we need to be very careful about what we conclude about our data.
# Set the phylum colors
phylum_colors <- c(
Acidobacteriota = "navy",
Actinobacteriota = "darkslategray2",
Armatimonadota = "deeppink1",
Alphaproteobacteria = "plum2",
Bacteroidota = "gold",
Betaproteobacteria = "plum1",
Bdellovibrionota = "red1",
Chloroflexi="black",
Crenarchaeota = "firebrick",
Cyanobacteria = "limegreen",
Deltaproteobacteria = "grey",
Desulfobacterota="magenta",
Firmicutes = "#3E9B96",
Gammaproteobacteria = "greenyellow",
"Marinimicrobia (SAR406 clade)" = "yellow",
Myxococcota = "#B5D6AA",
Nitrospirota = "palevioletred1",
Proteobacteria = "royalblue",
Planctomycetota = "darkorange",
"SAR324 clade(Marine group B)" = "olivedrab",
#Proteobacteria_unclassified = "greenyellow",
Thermoplasmatota = "green",
Verrucomicrobiota = "darkorchid1")
# Other = "grey")# Calculate the phylum relative abundance
# Note: The read depth MUST be normalized in some way: scale_reads
phylum_df <-
scaled_rooted_physeq %>%
# agglomerate at the phylum level
tax_glom(taxrank = "Phylum") %>%
# Transform counts to relative abundance
transform_sample_counts(function (x) {x/sum(x)}) %>%
# Melt to a long format
psmelt() %>%
# Filter out Phyla that are <1% - get rid of low abundant Phyla
dplyr::filter(Abundance > 0.01) %>%
# fix the order of date
mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
station = fct_relevel(station, c("Copano West", "Copano East",
"Mesquite Bay", "Aransas Bay",
"Shipping Channel")))
# Stacked Bar Plot With All phyla
# Plot Phylum Abundances - make sure to load phylum_colors
phylum_df %>%
# Warning: It's important to have one sample per x value,
# otherwise, it will take the sum between multiple samples
dplyr::filter(depth == 0.0) %>%
dplyr::filter(fraction == "Whole") %>%
ggplot(aes(x = station, y = Abundance, fill = Phylum)) +
facet_grid(.~date) +
geom_bar(stat = "identity", color = "black") +
labs(title = "Surface Phylum Composition") +
scale_fill_manual(values = phylum_colors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))## Make each phyla it's own row
phylum_df %>%
# Warning: It's important to have one sample per x value,
# otherwise, it will take the sum between multiple samples
dplyr::filter(depth == 0.0) %>%
dplyr::filter(fraction == "Whole") %>%
ggplot(aes(x = station, y = Abundance, fill = Phylum)) +
facet_grid(Phylum~date, scale = "free") +
geom_bar(stat = "identity", color = "black") +
labs(title = "Surface Phylum Composition") +
scale_fill_manual(values = phylum_colors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
strip.text.y = element_text(size = 5))# Narrow in on a specific group
# Actinobacteriota - y: abundance, x: station, dot plot + boxplot
phylum_df %>%
dplyr::filter(Phylum == "Actinobacteriota") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Actinobacteriota Phylum Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "right")# Calculate the Family relative abundance
# Note: The read depth MUST be normalized in some way: scale_reads
family_df <-
scaled_rooted_physeq %>%
# agglomerate at the phylum level
tax_glom(taxrank = "Family") %>%
# Transform counts to relative abundance
transform_sample_counts(function (x) {x/sum(x)}) %>%
# Melt to a long format
psmelt() %>%
# Filter out Phyla that are <1% - get rid of low abundant Phyla
dplyr::filter(Abundance > 0.01) %>%
# fix the order of date
mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
station = fct_relevel(station, c("Copano West", "Copano East",
"Mesquite Bay", "Aransas Bay",
"Shipping Channel")))
# Check family_df
#str(family_df)
# Plot family
# Actinobacteriota Families
family_df %>%
dplyr::filter(Phylum == "Actinobacteriota") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Family, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Actinobacteriota Family Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")# Plot family
family_df %>%
dplyr::filter(Phylum == "Cyanobacteria") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Family, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Cyanobacteria Family Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")# Calculate the Family relative abundance
# Note: The read depth MUST be normalized in some way: scale_reads
genus_df <-
scaled_rooted_physeq %>%
# agglomerate at the phylum level
tax_glom(taxrank = "Genus") %>%
# Transform counts to relative abundance
transform_sample_counts(function (x) {x/sum(x)}) %>%
# Melt to a long format
psmelt() %>%
# Filter out Phyla that are <1% - get rid of low abundant Phyla
dplyr::filter(Abundance > 0.01) %>%
# fix the order of date
mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
station = fct_relevel(station, c("Copano West", "Copano East",
"Mesquite Bay", "Aransas Bay",
"Shipping Channel")))
# Actinobacteriota
# Plot genus
genus_df %>%
dplyr::filter(Phylum == "Actinobacteriota") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Genus, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Actinobacteriota Genus Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")# Cyanobacteria
# Plot genus
genus_df %>%
dplyr::filter(Phylum == "Cyanobacteria") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Genus, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Cyanobacteria Genus Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")For reproducibility
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os macOS Sonoma 14.4.1
## system aarch64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2024-05-08
## pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.0)
## ape 5.7-1 2023-03-13 [1] CRAN (R 4.3.0)
## Biobase 2.62.0 2023-10-26 [1] Bioconductor
## BiocGenerics 0.48.1 2023-11-02 [1] Bioconductor
## biomformat 1.30.0 2023-10-26 [1] Bioconductor
## Biostrings 2.70.1 2023-10-26 [1] Bioconductor
## bitops 1.0-7 2021-04-24 [1] CRAN (R 4.3.0)
## bslib 0.6.1 2023-11-28 [1] CRAN (R 4.3.1)
## cachem 1.0.8 2023-05-01 [2] CRAN (R 4.3.0)
## cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.1)
## cluster 2.1.6 2023-12-01 [1] CRAN (R 4.3.1)
## codetools 0.2-19 2023-02-01 [2] CRAN (R 4.3.2)
## colorspace 2.1-0 2023-01-23 [2] CRAN (R 4.3.0)
## crayon 1.5.2 2022-09-29 [2] CRAN (R 4.3.0)
## data.table 1.14.10 2023-12-08 [1] CRAN (R 4.3.1)
## devtools * 2.4.5 2022-10-11 [1] CRAN (R 4.3.0)
## digest 0.6.33 2023-07-07 [2] CRAN (R 4.3.0)
## dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
## ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.3.0)
## evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.1)
## fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1)
## farver 2.1.1 2022-07-06 [2] CRAN (R 4.3.0)
## fastmap 1.1.1 2023-02-24 [2] CRAN (R 4.3.0)
## forcats * 1.0.0 2023-01-29 [2] CRAN (R 4.3.0)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.0)
## fs 1.6.3 2023-07-20 [2] CRAN (R 4.3.0)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.3.0)
## GenomeInfoDb 1.38.5 2023-12-30 [1] Bioconductor 3.18 (R 4.3.2)
## GenomeInfoDbData 1.2.11 2024-01-10 [1] Bioconductor
## ggplot2 * 3.4.4 2023-10-12 [1] CRAN (R 4.3.1)
## glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1)
## gtable 0.3.4 2023-08-21 [2] CRAN (R 4.3.0)
## highr 0.10 2022-12-22 [2] CRAN (R 4.3.0)
## hms 1.1.3 2023-03-21 [2] CRAN (R 4.3.0)
## htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.1)
## htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.1)
## httpuv 1.6.13 2023-12-06 [1] CRAN (R 4.3.1)
## igraph 1.6.0 2023-12-11 [1] CRAN (R 4.3.1)
## IRanges 2.36.0 2023-10-26 [1] Bioconductor
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.0)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.3.0)
## jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1)
## knitr 1.45 2023-10-30 [1] CRAN (R 4.3.1)
## labeling 0.4.3 2023-08-29 [2] CRAN (R 4.3.0)
## later 1.3.2 2023-12-06 [1] CRAN (R 4.3.1)
## lattice * 0.22-5 2023-10-24 [1] CRAN (R 4.3.1)
## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.1)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.3.0)
## MASS 7.3-60 2023-05-04 [2] CRAN (R 4.3.2)
## Matrix 1.6-4 2023-11-30 [1] CRAN (R 4.3.1)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.3.0)
## mgcv 1.9-1 2023-12-21 [1] CRAN (R 4.3.1)
## mime 0.12 2021-09-28 [2] CRAN (R 4.3.0)
## miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.3.0)
## multtest 2.58.0 2023-10-26 [1] Bioconductor
## munsell 0.5.0 2018-06-12 [2] CRAN (R 4.3.0)
## nlme 3.1-164 2023-11-27 [1] CRAN (R 4.3.1)
## pacman 0.5.1 2019-03-11 [2] CRAN (R 4.3.0)
## patchwork * 1.2.0 2024-01-08 [1] CRAN (R 4.3.1)
## permute * 0.9-7 2022-01-27 [1] CRAN (R 4.3.0)
## phyloseq * 1.46.0 2023-10-24 [1] Bioconductor
## pillar 1.9.0 2023-03-22 [2] CRAN (R 4.3.0)
## pkgbuild 1.4.3 2023-12-10 [1] CRAN (R 4.3.1)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.3.0)
## pkgload 1.3.3 2023-09-22 [1] CRAN (R 4.3.1)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.3.1)
## profvis 0.3.8 2023-05-02 [1] CRAN (R 4.3.0)
## promises 1.2.1 2023-08-10 [1] CRAN (R 4.3.0)
## purrr * 1.0.2 2023-08-10 [2] CRAN (R 4.3.0)
## R6 2.5.1 2021-08-19 [2] CRAN (R 4.3.0)
## Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.3.1)
## RCurl 1.98-1.14 2024-01-09 [1] CRAN (R 4.3.1)
## readr * 2.1.4 2023-02-10 [2] CRAN (R 4.3.0)
## remotes 2.4.2.1 2023-07-18 [2] CRAN (R 4.3.0)
## reshape2 1.4.4 2020-04-09 [2] CRAN (R 4.3.0)
## rhdf5 2.46.1 2023-12-02 [1] Bioconductor 3.18 (R 4.3.2)
## rhdf5filters 1.14.1 2023-12-16 [1] Bioconductor 3.18 (R 4.3.2)
## Rhdf5lib 1.24.1 2023-12-12 [1] Bioconductor 3.18 (R 4.3.2)
## rlang 1.1.2 2023-11-04 [1] CRAN (R 4.3.1)
## rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.1)
## rstudioapi 0.15.0 2023-07-07 [2] CRAN (R 4.3.0)
## S4Vectors 0.40.2 2023-11-25 [1] Bioconductor 3.18 (R 4.3.2)
## sass 0.4.8 2023-12-06 [1] CRAN (R 4.3.1)
## scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.1)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
## shiny 1.8.0 2023-11-17 [1] CRAN (R 4.3.1)
## stringi 1.8.3 2023-12-11 [1] CRAN (R 4.3.1)
## stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.1)
## survival 3.5-7 2023-08-14 [2] CRAN (R 4.3.2)
## tibble * 3.2.1 2023-03-20 [2] CRAN (R 4.3.0)
## tidyr * 1.3.0 2023-01-24 [2] CRAN (R 4.3.0)
## tidyselect 1.2.0 2022-10-10 [2] CRAN (R 4.3.0)
## tidyverse * 2.0.0 2023-02-22 [2] CRAN (R 4.3.0)
## timechange 0.2.0 2023-01-11 [2] CRAN (R 4.3.0)
## tzdb 0.4.0 2023-05-12 [2] CRAN (R 4.3.0)
## urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.3.0)
## usethis * 2.2.2 2023-07-06 [1] CRAN (R 4.3.0)
## utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
## vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
## vegan * 2.6-4 2022-10-11 [1] CRAN (R 4.3.0)
## withr 2.5.2 2023-10-30 [1] CRAN (R 4.3.1)
## xfun 0.41 2023-11-01 [1] CRAN (R 4.3.1)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 4.3.0)
## XVector 0.42.0 2023-10-26 [1] Bioconductor
## yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.1)
## zlibbioc 1.48.0 2023-10-26 [1] Bioconductor
##
## [1] /Users/mls528/Library/R/arm64/4.3/library
## [2] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
##
## ──────────────────────────────────────────────────────────────────────────────